/*
 * Copyright (c) 2007, 2015, Oracle and/or its affiliates. All rights reserved.
 * ORACLE PROPRIETARY/CONFIDENTIAL. Use is subject to license terms.
 */
/*
 * Copyright 1999-2004 The Apache Software Foundation.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
/*
 * $Id: EncodingInfo.java,v 1.2.4.2 2005/09/15 12:01:24 suresh_emailid Exp $
 */
package com.sun.org.apache.xml.internal.serializer;

import java.io.UnsupportedEncodingException;

/**
 * Holds information about a given encoding, which is the Java name for the encoding, the equivalent
 * ISO name. <p> An object of this type has two useful methods
 * <pre>
 * isInEncoding(char ch);
 * </pre>
 * which can be called if the character is not the high one in a surrogate pair and:
 * <pre>
 * isInEncoding(char high, char low);
 * </pre>
 * which can be called if the two characters from a high/low surrogate pair. <p> An EncodingInfo
 * object is a node in a binary search tree. Such a node will answer if a character is in the
 * encoding, and do so for a given range of unicode values (<code>m_first</code> to
 * <code>m_last</code>). It will handle a certain range of values explicitly
 * (<code>m_explFirst</code> to <code>m_explLast</code>). If the unicode point is before that
 * explicit range, that is it is in the range <code>m_first <= value < m_explFirst</code>, then it
 * will delegate to another EncodingInfo object for The root of such a tree, m_before.  Likewise for
 * values in the range <code>m_explLast < value <= m_last</code>, but delgating to
 * <code>m_after</code> <p> Actually figuring out if a code point is in the encoding is expensive.
 * So the purpose of this tree is to cache such determinations, and not to build the entire tree of
 * information at the start, but only build up as much of the tree as is used during the
 * transformation. <p> This Class is not a public API, and should only be used internally within the
 * serializer.
 *
 * @xsl.usage internal
 */
public final class EncodingInfo extends Object {

  /**
   * The ISO encoding name.
   */
  final String name;

  /**
   * The name used by the Java convertor.
   */
  final String javaName;

  /**
   * A helper object that we can ask if a
   * single char, or a surrogate UTF-16 pair
   * of chars that form a single character,
   * is in this encoding.
   */
  private InEncoding m_encoding;

  /**
   * This is not a public API. It returns true if the
   * char in question is in the encoding.
   *
   * @param ch the char in question.
   * @xsl.usage internal
   */
  public boolean isInEncoding(char ch) {
    if (m_encoding == null) {
      m_encoding = new EncodingImpl();

      // One could put alternate logic in here to
      // instantiate another object that implements the
      // InEncoding interface. For example if the JRE is 1.4 or up
      // we could have an object that uses JRE 1.4 methods
    }
    return m_encoding.isInEncoding(ch);
  }

  /**
   * This is not a public API. It returns true if the
   * character formed by the high/low pair is in the encoding.
   *
   * @param high a char that the a high char of a high/low surrogate pair.
   * @param low a char that is the low char of a high/low surrogate pair.
   * @xsl.usage internal
   */
  public boolean isInEncoding(char high, char low) {
    if (m_encoding == null) {
      m_encoding = new EncodingImpl();

      // One could put alternate logic in here to
      // instantiate another object that implements the
      // InEncoding interface. For example if the JRE is 1.4 or up
      // we could have an object that uses JRE 1.4 methods
    }
    return m_encoding.isInEncoding(high, low);
  }

  /**
   * Create an EncodingInfo object based on the ISO name and Java name.
   * If both parameters are null any character will be considered to
   * be in the encoding. This is useful for when the serializer is in
   * temporary output state, and has no assciated encoding.
   *
   * @param name reference to the ISO name.
   * @param javaName reference to the Java encoding name.
   */
  public EncodingInfo(String name, String javaName) {

    this.name = name;
    this.javaName = javaName;
  }


  /**
   * A simple interface to isolate the implementation.
   * We could also use some new JRE 1.4 methods in another implementation
   * provided we use reflection with them.
   * <p>
   * This interface is not a public API,
   * and should only be used internally within the serializer.
   *
   * @xsl.usage internal
   */
  private interface InEncoding {

    /**
     * Returns true if the char is in the encoding
     */
    public boolean isInEncoding(char ch);

    /**
     * Returns true if the high/low surrogate pair forms
     * a character that is in the encoding.
     */
    public boolean isInEncoding(char high, char low);
  }

  /**
   * This class implements the
   */
  private class EncodingImpl implements InEncoding {


    public boolean isInEncoding(char ch1) {
      final boolean ret;
      int codePoint = Encodings.toCodePoint(ch1);
      if (codePoint < m_explFirst) {
        // The unicode value is before the range
        // that we explictly manage, so we delegate the answer.

        // If we don't have an m_before object to delegate to, make one.
        if (m_before == null) {
          m_before =
              new EncodingImpl(
                  m_encoding,
                  m_first,
                  m_explFirst - 1,
                  codePoint);
        }
        ret = m_before.isInEncoding(ch1);
      } else if (m_explLast < codePoint) {
        // The unicode value is after the range
        // that we explictly manage, so we delegate the answer.

        // If we don't have an m_after object to delegate to, make one.
        if (m_after == null) {
          m_after =
              new EncodingImpl(
                  m_encoding,
                  m_explLast + 1,
                  m_last,
                  codePoint);
        }
        ret = m_after.isInEncoding(ch1);
      } else {
        // The unicode value is in the range we explitly handle
        final int idx = codePoint - m_explFirst;

        // If we already know the answer, just return it.
        if (m_alreadyKnown[idx]) {
          ret = m_isInEncoding[idx];
        } else {
          // We don't know the answer, so find out,
          // which may be expensive, then cache the answer
          ret = inEncoding(ch1, m_encoding);
          m_alreadyKnown[idx] = true;
          m_isInEncoding[idx] = ret;
        }
      }
      return ret;
    }

    public boolean isInEncoding(char high, char low) {
      final boolean ret;
      int codePoint = Encodings.toCodePoint(high, low);
      if (codePoint < m_explFirst) {
        // The unicode value is before the range
        // that we explictly manage, so we delegate the answer.

        // If we don't have an m_before object to delegate to, make one.
        if (m_before == null) {
          m_before =
              new EncodingImpl(
                  m_encoding,
                  m_first,
                  m_explFirst - 1,
                  codePoint);
        }
        ret = m_before.isInEncoding(high, low);
      } else if (m_explLast < codePoint) {
        // The unicode value is after the range
        // that we explictly manage, so we delegate the answer.

        // If we don't have an m_after object to delegate to, make one.
        if (m_after == null) {
          m_after =
              new EncodingImpl(
                  m_encoding,
                  m_explLast + 1,
                  m_last,
                  codePoint);
        }
        ret = m_after.isInEncoding(high, low);
      } else {
        // The unicode value is in the range we explitly handle
        final int idx = codePoint - m_explFirst;

        // If we already know the answer, just return it.
        if (m_alreadyKnown[idx]) {
          ret = m_isInEncoding[idx];
        } else {
          // We don't know the answer, so find out,
          // which may be expensive, then cache the answer
          ret = inEncoding(high, low, m_encoding);
          m_alreadyKnown[idx] = true;
          m_isInEncoding[idx] = ret;
        }
      }
      return ret;
    }

    /**
     * The encoding.
     */
    final private String m_encoding;
    /**
     * m_first through m_last is the range of unicode
     * values that this object will return an answer on.
     * It may delegate to a similar object with a different
     * range
     */
    final private int m_first;

    /**
     * m_explFirst through m_explLast is the range of unicode
     * value that this object handles explicitly and does not
     * delegate to a similar object.
     */
    final private int m_explFirst;
    final private int m_explLast;
    final private int m_last;

    /**
     * The object, of the same type as this one,
     * that handles unicode values in a range before
     * the range explictly handled by this object, and
     * to which this object may delegate.
     */
    private InEncoding m_before;
    /**
     * The object, of the same type as this one,
     * that handles unicode values in a range after
     * the range explictly handled by this object, and
     * to which this object may delegate.
     */
    private InEncoding m_after;

    /**
     * The number of unicode values explicitly handled
     * by a single EncodingInfo object. This value is
     * tuneable, but is set to 128 because that covers the
     * entire low range of ASCII type chars within a single
     * object.
     */
    private static final int RANGE = 128;

    /**
     * A flag to record if we already know the answer
     * for the given unicode value.
     */
    final private boolean m_alreadyKnown[] = new boolean[RANGE];
    /**
     * A table holding the answer on whether the given unicode
     * value is in the encoding.
     */
    final private boolean m_isInEncoding[] = new boolean[RANGE];

    private EncodingImpl() {
      // This object will answer whether any unicode value
      // is in the encoding, it handles values 0 through Integer.MAX_VALUE
      this(javaName, 0, Integer.MAX_VALUE, (char) 0);
    }

    private EncodingImpl(String encoding, int first, int last, int codePoint) {
      // Set the range of unicode values that this object manages
      // either explicitly or implicitly.
      m_first = first;
      m_last = last;

      // Set the range of unicode values that this object
      // explicitly manages. Align the explicitly managed values
      // to RANGE so multiple EncodingImpl objects dont manage the same
      // values.
      m_explFirst = codePoint / RANGE * RANGE;
      m_explLast = m_explFirst + (RANGE - 1);

      m_encoding = encoding;

      if (javaName != null) {
        // Some optimization.
        if (0 <= m_explFirst && m_explFirst <= 127) {
          // This particular EncodingImpl explicitly handles
          // characters in the low range.
          if ("UTF8".equals(javaName)
              || "UTF-16".equals(javaName)
              || "ASCII".equals(javaName)
              || "US-ASCII".equals(javaName)
              || "Unicode".equals(javaName)
              || "UNICODE".equals(javaName)
              || javaName.startsWith("ISO8859")) {

            // Not only does this EncodingImpl object explicitly
            // handle chracters in the low range, it is
            // also one that we know something about, without
            // needing to call inEncoding(char ch, String encoding)
            // for this low range
            //
            // By initializing the table ahead of time
            // for these low values, we prevent the expensive
            // inEncoding(char ch, String encoding)
            // from being called, at least for these common
            // encodings.
            for (int unicode = 1; unicode < 127; unicode++) {
              final int idx = unicode - m_explFirst;
              if (0 <= idx && idx < RANGE) {
                m_alreadyKnown[idx] = true;
                m_isInEncoding[idx] = true;
              }
            }
          }
        }

                /* A little bit more than optimization.
                 *
                 * We will say that any character is in the encoding if
                 * we don't have an encoding.
                 * This is meaningful when the serializer is being used
                 * in temporary output state, where we are not writing to
                 * the final output tree.  It is when writing to the
                 * final output tree that we need to worry about the output
                 * encoding
                 */
        if (javaName == null) {
          for (int idx = 0; idx < m_alreadyKnown.length; idx++) {
            m_alreadyKnown[idx] = true;
            m_isInEncoding[idx] = true;
          }
        }
      }
    }
  }

  /**
   * This is heart of the code that determines if a given character
   * is in the given encoding. This method is probably expensive,
   * and the answer should be cached.
   * <p>
   * This method is not a public API,
   * and should only be used internally within the serializer.
   *
   * @param ch the char in question, that is not a high char of a high/low surrogate pair.
   * @param encoding the Java name of the enocding.
   * @xsl.usage internal
   */
  private static boolean inEncoding(char ch, String encoding) {
    boolean isInEncoding;
    try {
      char cArray[] = new char[1];
      cArray[0] = ch;
      // Construct a String from the char
      String s = new String(cArray);
      // Encode the String into a sequence of bytes
      // using the given, named charset.
      byte[] bArray = s.getBytes(encoding);
      isInEncoding = inEncoding(ch, bArray);

    } catch (Exception e) {
      isInEncoding = false;

      // If for some reason the encoding is null, e.g.
      // for a temporary result tree, we should just
      // say that every character is in the encoding.
      if (encoding == null) {
        isInEncoding = true;
      }
    }
    return isInEncoding;
  }

  /**
   * This is heart of the code that determines if a given high/low
   * surrogate pair forms a character that is in the given encoding.
   * This method is probably expensive, and the answer should be cached.
   * <p>
   * This method is not a public API,
   * and should only be used internally within the serializer.
   *
   * @param high the high char of a high/low surrogate pair.
   * @param low the low char of a high/low surrogate pair.
   * @param encoding the Java name of the encoding.
   * @xsl.usage internal
   */
  private static boolean inEncoding(char high, char low, String encoding) {
    boolean isInEncoding;
    try {
      char cArray[] = new char[2];
      cArray[0] = high;
      cArray[1] = low;
      // Construct a String from the char
      String s = new String(cArray);
      // Encode the String into a sequence of bytes
      // using the given, named charset.
      byte[] bArray = s.getBytes(encoding);
      isInEncoding = inEncoding(high, bArray);
    } catch (Exception e) {
      isInEncoding = false;
    }

    return isInEncoding;
  }

  /**
   * This method is the core of determining if character
   * is in the encoding. The method is not foolproof, because
   * s.getBytes(encoding) has specified behavior only if the
   * characters are in the specified encoding. However this
   * method tries it's best.
   *
   * @param ch the char that was converted using getBytes, or the first char of a high/low pair that
   * was converted.
   * @param data the bytes written out by the call to s.getBytes(encoding);
   * @return true if the character is in the encoding.
   */
  private static boolean inEncoding(char ch, byte[] data) {
    final boolean isInEncoding;
    // If the string written out as data is not in the encoding,
    // the output is not specified according to the documentation
    // on the String.getBytes(encoding) method,
    // but we do our best here.
    if (data == null || data.length == 0) {
      isInEncoding = false;
    } else {
      if (data[0] == 0) {
        isInEncoding = false;
      } else if (data[0] == '?' && ch != '?') {
        isInEncoding = false;
      }
            /*
             * else if (isJapanese) {
             *   // isJapanese is really
             *   //   (    "EUC-JP".equals(javaName)
             *   //    ||  "EUC_JP".equals(javaName)
             *  //     ||  "SJIS".equals(javaName)   )
             *
             *   // Work around some bugs in JRE for Japanese
             *   if(data[0] == 0x21)
             *     isInEncoding = false;
             *   else if (ch == 0xA5)
             *     isInEncoding = false;
             *   else
             *     isInEncoding = true;
             * }
             */

      else {
        // We don't know for sure, but it looks like it is in the encoding
        isInEncoding = true;
      }
    }
    return isInEncoding;
  }

}
